import os
from datetime import datetime

import contextily as ctx
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.cluster import DBSCAN

import pyspark
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType, FloatType
from pyspark.sql.functions import col, column, when, countDistinct
from pyspark.sql.functions import expr
from pyspark.sql.functions import split
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, MultilayerPerceptronClassifier
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.feature import StandardScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from decision_tree_plot.decision_tree_parser import decision_tree_parse
from decision_tree_plot.decision_tree_plot import plot_trees
# Start a local Spark session for the crime-data analysis.
crime = SparkSession.builder.master("local").appName("CrimeDataAnalysis").getOrCreate()
# Spark does not expand "~" itself, so resolve the home directory before setting
# the checkpoint dir (the literal string would create a directory named "~").
crime.sparkContext.setCheckpointDir(os.path.expanduser("~/scratch"))
# One CSV of Chicago crime records per year (header row + inferred column types).
Data19 = crime.read.csv("./Crimes_-_2019_20231112.csv", header=True, inferSchema=True)
Data20 = crime.read.csv("./Crimes_-_2020_20231112.csv", header=True, inferSchema=True)
Data21 = crime.read.csv("./Crimes_-_2021_20231112.csv", header=True, inferSchema=True)
Data22 = crime.read.csv("./Crimes_-_2022_20231016.csv", header=True, inferSchema=True)
Data23 = crime.read.csv("./Crimes_-_2023_20231016.csv", header=True, inferSchema=True)
CombinedData = Data22.union(Data23).union(Data19).union(Data20).union(Data21)
# Count each year exactly once (every .count() triggers a full Spark job) and
# reuse the results for both the per-year lines and the grand total.
yearly_counts = {
    2019: Data19.count(),
    2020: Data20.count(),
    2021: Data21.count(),
    2022: Data22.count(),
    2023: Data23.count(),
}
for year in sorted(yearly_counts):
    print(f"Total Entries in {year}: {yearly_counts[year]}")
print(f"Total Entries in 2019 to 2023: {sum(yearly_counts.values())}")
# Output: Total Entries in 2019: 149558 Total Entries in 2020: 118313 Total Entries in 2021: 104746 Total Entries in 2022: 238990 Total Entries in 2023: 199486 Total Entries in 2019 to 2023: 811093
# Keep only the analysis-relevant columns, then drop any row with a missing value.
selected_columns = [
    "Date", "Block", "Primary Type", "Description", "Location Description",
    "Arrest", "Domestic", "Beat", "District", "Ward", "Community Area",
    "Year", "Latitude", "Longitude",
]
df = CombinedData.select(*selected_columns)
df_clean = df.dropna(how='any')
print(f"Total Entries after cleaning in 2019 to 2023: {df_clean.count()}")
# Output: Total Entries after cleaning in 2019 to 2023: 797415
# Word-count style map-reduce: one (type, 1) pair per record, summed per crime type,
# then sorted by frequency, most common first.
primary_type_counts_sorted = (
    df_clean.rdd
    .map(lambda row: (row["Primary Type"], 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda x: x[1], ascending=False)
    .collect()
)
primary = []  # crime-type labels, for plotting
counts = []   # matching occurrence counts, for plotting
for primary_type, count in primary_type_counts_sorted:
    print(f"{primary_type}: {count}")
    primary.append(primary_type)
    counts.append(int(count))
# Output: THEFT: 171744 BATTERY: 148132 CRIMINAL DAMAGE: 91647 ASSAULT: 70819 MOTOR VEHICLE THEFT: 59101 OTHER OFFENSE: 50390 DECEPTIVE PRACTICE: 50341 ROBBERY: 29480 WEAPONS VIOLATION: 28668 BURGLARY: 26197 NARCOTICS: 24041 CRIMINAL TRESPASS: 16046 OFFENSE INVOLVING CHILDREN: 6390 CRIMINAL SEXUAL ASSAULT: 4280 SEX OFFENSE: 3855 PUBLIC PEACE VIOLATION: 3334 HOMICIDE: 3309 INTERFERENCE WITH PUBLIC OFFICER: 2407 ARSON: 1598 STALKING: 1211 PROSTITUTION: 1118 LIQUOR LAW VIOLATION: 635 CONCEALED CARRY LICENSE VIOLATION: 634 CRIM SEXUAL ASSAULT: 618 INTIMIDATION: 617 KIDNAPPING: 450 OBSCENITY: 151 GAMBLING: 122 HUMAN TRAFFICKING: 30 OTHER NARCOTIC VIOLATION: 20 PUBLIC INDECENCY: 19 NON-CRIMINAL: 11
# Horizontal bar chart of total crimes per primary type.
fig, ax = plt.subplots(figsize=(11, 7))
ax.barh(primary, counts)
ax.set_ylabel('Crime Type')
ax.set_xlabel('No. of Crimes')
ax.set_title('Crime')
plt.show()
# Collapse crime types below `threshold` percent of the total into one "Other"
# slice. Compute the share filter once instead of three equivalent passes.
threshold = 2  # minimum share (percent) for a type to keep its own slice
total_count = sum(counts)
major = [(p, c) for p, c in zip(primary, counts) if (c / total_count) * 100 > threshold]
# Everything not kept is "Other"; the integer totals make this exact.
minor_total = total_count - sum(c for _, c in major)
new_primary = [p for p, _ in major] + ['Other']
new_counts = [c for _, c in major] + [minor_total]
colors = list(plt.cm.tab20.colors)
# Give the aggregated "Other" slice a fixed, recognizable color.
other_index = new_primary.index('Other')
colors[other_index] = 'Yellow'
# Create the pie chart
plt.figure(figsize=(9, 9))
plt.pie(new_counts, labels=new_primary, autopct='%1.1f%%', colors=colors)
plt.title('Crime Type Distribution')
plt.axis('equal')
plt.show()
# Map-Reduce according to respective charts
# Map-Reduce over (primary type, description, location) triples to rank the most
# common concrete crime combinations.
sorted_results = (
    df_clean.rdd
    .map(lambda row: ((row['Primary Type'], row['Description'], row['Location Description']), 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda x: x[1], ascending=False)
    .collect()
)
# Lists for visualization
primary_types = []
descriptions = []
location_descriptions = []
counts = []
# Print and record the ten most frequent combinations.
for (primary_type, description, location_description), count in sorted_results[:10]:
    print(f"{primary_type}, {description}, {location_description}: {count}")
    primary_types.append(primary_type)
    descriptions.append(description)
    location_descriptions.append(location_description)
    counts.append(int(count))
# Output: MOTOR VEHICLE THEFT, AUTOMOBILE, STREET: 34910 BATTERY, DOMESTIC BATTERY SIMPLE, APARTMENT: 32376 CRIMINAL DAMAGE, TO VEHICLE, STREET: 31436 THEFT, $500 AND UNDER, STREET: 22036 THEFT, OVER $500, STREET: 21995 BATTERY, DOMESTIC BATTERY SIMPLE, RESIDENCE: 19272 CRIMINAL DAMAGE, TO PROPERTY, APARTMENT: 14760 CRIMINAL DAMAGE, TO PROPERTY, RESIDENCE: 12319 ASSAULT, SIMPLE, APARTMENT: 10898 THEFT, RETAIL THEFT, SMALL RETAIL STORE: 10434
# Build a small Pandas frame of the top-10 combinations for plotting.
result_df = pd.DataFrame({
    'Primary Type': primary_types,
    'Description': descriptions,
    'Location Description': location_descriptions,
    'Count': counts,
})
# Interactive tree map: one branch per primary type -> description -> location.
fig = px.treemap(
    result_df,
    path=['Primary Type', 'Description', 'Location Description'],
    values='Count',
    color='Primary Type',  # Color by primary type
    title='Tree Map of Crime Categories',
    color_continuous_scale='Blues',
)
# Show the figure
fig.show()
# extracting hour and AM/PM from the 'Date' column
def extract_hour_am_pm(date_str):
    """Pull "<hour> <AM|PM>" out of a date string like "01/01/2019 11:40:00 PM".

    Returns None when the string does not contain at least a date, time, and
    AM/PM part separated by spaces, or when the input is not a string (e.g. a
    null Date cell arriving through the Spark UDF).
    """
    try:
        parts = date_str.strip().split(" ")
    except AttributeError:
        # Non-string input (None, numbers) has no .strip(); treat as unparseable.
        return None
    if len(parts) < 3:
        return None
    time_part = parts[1]
    am_pm = parts[2]
    hour = time_part.split(":")[0]
    return f"{hour} {am_pm}"
# Register the parser as a Spark UDF and derive an 'Hour_AMPM' column from 'Date'.
extract_hour_am_pm_udf = udf(extract_hour_am_pm, StringType())
df_clean = df_clean.withColumn('Hour_AMPM', extract_hour_am_pm_udf(df_clean['Date']))
# Crimes per hour bucket; ordering is lexicographic on the "HH AM/PM" label.
hourly_counts_list = (
    df_clean.groupBy('Hour_AMPM')
    .count()
    .orderBy('Hour_AMPM')
    .collect()
)
for entry in hourly_counts_list:
    print(f"Hour: {entry['Hour_AMPM']}, Number of Crimes: {entry['count']}")
# Output: Hour: 01 AM, Number of Crimes: 26408 Hour: 01 PM, Number of Crimes: 36587 Hour: 02 AM, Number of Crimes: 23026 Hour: 02 PM, Number of Crimes: 38224 Hour: 03 AM, Number of Crimes: 19196 Hour: 03 PM, Number of Crimes: 42868 Hour: 04 AM, Number of Crimes: 15637 Hour: 04 PM, Number of Crimes: 41906 Hour: 05 AM, Number of Crimes: 13430 Hour: 05 PM, Number of Crimes: 43043 Hour: 06 AM, Number of Crimes: 14387 Hour: 06 PM, Number of Crimes: 42690 Hour: 07 AM, Number of Crimes: 19330 Hour: 07 PM, Number of Crimes: 42027 Hour: 08 AM, Number of Crimes: 26539 Hour: 08 PM, Number of Crimes: 41133 Hour: 09 AM, Number of Crimes: 33831 Hour: 09 PM, Number of Crimes: 38107 Hour: 10 AM, Number of Crimes: 34643 Hour: 10 PM, Number of Crimes: 36972 Hour: 11 AM, Number of Crimes: 35385 Hour: 11 PM, Number of Crimes: 32801 Hour: 12 AM, Number of Crimes: 53340 Hour: 12 PM, Number of Crimes: 45905
# Bar chart of crime counts per hour-of-day bucket.
hours = [entry['Hour_AMPM'] for entry in hourly_counts_list]
counts = [entry['count'] for entry in hourly_counts_list]
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(hours, counts, color='green')
# Titles and labels.
ax.set_title('Crime Counts by Hour')
ax.set_xlabel('Hour')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.show()
# ---- Synthetic time-series demo (not derived from the crime data) ----
# pandas / numpy / plotly are already imported at the top of the file; the
# redundant mid-file re-imports were removed.
# Generating sample data from 2019 to 2023.
date_range = pd.date_range(start='2019-01-01', end='2023-12-31', freq='D')
np.random.seed(0)  # For reproducibility
metric_values = np.random.randn(len(date_range)) * 100  # Random fluctuations
metric_values += np.sin(np.linspace(0, 40, len(date_range))) * 50  # Adding a sine wave pattern
data = {
    'Date': date_range,
    'Metric': metric_values
}
# NOTE: this rebinds `df`; the cleaned Spark frame stays available as `df_clean`.
df = pd.DataFrame(data)
# Create an interactive time-series plot
fig = px.line(df, x='Date', y='Metric', title='Interactive Time-Series Graph from 2019 to 2023')
# Add a range slider plus quick-zoom buttons (1w/1m/6m/YTD/1y/all).
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=7, label='1w', step='day', stepmode='backward'),
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=1, label='YTD', step='year', stepmode='todate'),
            dict(count=1, label='1y', step='year', stepmode='backward'),
            dict(step='all')
        ])
    )
)
# Show the figure
fig.show()
# Total arrests per crime type (each record contributes 1 only when Arrest is true).
arrest_counts = (
    df_clean.rdd
    .map(lambda row: (row["Primary Type"], 1 if row["Arrest"] else 0))
    .reduceByKey(lambda a, b: a + b)
    .collect()
)
# Keep the N largest contributors; fold everything else into an "Other" slice.
N = 10
sorted_arrest_counts = sorted(arrest_counts, key=lambda x: x[1], reverse=True)[:N]
# Build the membership set once — the original rebuilt a list per element,
# making the "Other" aggregation quadratic.
top_types = {crime_type for crime_type, _ in sorted_arrest_counts}
other_count = sum(count for crime_type, count in arrest_counts if crime_type not in top_types)
if other_count > 0:
    sorted_arrest_counts.append(('Other', other_count))
crime_types, counts = zip(*sorted_arrest_counts)
# One viridis color per top slice, grey for the "Other" slice.
colors = list(plt.cm.viridis(np.linspace(0, 1, N)))
colors.append('grey')
# Plotting pie chart
plt.figure(figsize=(12, 12))
wedges, texts, autotexts = plt.pie(counts, colors=colors, autopct='%1.1f%%', startangle=140, textprops=dict(color="w"))
plt.title('Percentage of Arrests by Crime Type')
plt.legend(wedges, crime_types, title="Crime Types", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
plt.setp(autotexts, size=10, weight="bold")
plt.axis('equal')
plt.show()
# Crime counts per (primary type, community area) pair, most frequent first.
sorted_primary_type_area = (
    df_clean.rdd
    .map(lambda row: ((row["Primary Type"], row["Community Area"]), 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda x: x[1], ascending=False)
)
primary_type_area_counts_sorted = sorted_primary_type_area.collect()
# Show the ten most frequent pairs.
for (primary_type, community_area), count in primary_type_area_counts_sorted[:10]:
    print(f"Crime Type: {primary_type}, Community Area: {community_area}, Number of Crimes: {count}")
# Output: Crime Type: THEFT, Community Area: 8, Number of Crimes: 13165 Crime Type: THEFT, Community Area: 32, Number of Crimes: 11135 Crime Type: BATTERY, Community Area: 25, Number of Crimes: 9630 Crime Type: THEFT, Community Area: 28, Number of Crimes: 9489 Crime Type: THEFT, Community Area: 24, Number of Crimes: 7765 Crime Type: THEFT, Community Area: 6, Number of Crimes: 6943 Crime Type: BATTERY, Community Area: 43, Number of Crimes: 6296 Crime Type: THEFT, Community Area: 25, Number of Crimes: 6149 Crime Type: BATTERY, Community Area: 29, Number of Crimes: 5658 Crime Type: CRIMINAL DAMAGE, Community Area: 25, Number of Crimes: 5045
# Reuse the (type, area) counts already collected above — re-collecting would
# rerun the whole Spark job for identical data.
# Creating a dataframe with the collected data.
data = {'Crime Type': [], 'Community Area': [], 'Count': []}
for (primary_type, community_area), count in primary_type_area_counts_sorted:
    data['Crime Type'].append(primary_type)
    data['Community Area'].append(community_area)
    data['Count'].append(count)
df_heatmap = pd.DataFrame(data)
# Pivot to a (community area x crime type) count matrix.
# DataFrame.pivot's positional arguments were removed in pandas 2.0 — keywords required.
heatmap_data = df_heatmap.pivot(index="Community Area", columns="Crime Type", values="Count")
# Graph
plt.figure(figsize=(25, 20))
sns.heatmap(heatmap_data, annot=False, cmap="Spectral", cbar_kws={'label': 'Number of Crimes'})
plt.title('Crime Counts by Community Area and Crime Type')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
# ---- Synthetic map demo: random points within Chicago's bounding box ----
np.random.seed(0)
num_samples = 100
# Column order matches the original so the seeded RNG draws are identical.
df = pd.DataFrame({
    'Latitude': np.random.uniform(41.6400, 42.0230, num_samples),
    'Longitude': np.random.uniform(-87.9400, -87.5200, num_samples),
    'Crime Count': np.random.randint(1, 100, num_samples),
    'Crime Type': np.random.choice(['Theft', 'Assault', 'Burglary', 'Vandalism'], num_samples),
    'Date': pd.date_range(start='2021-01-01', periods=num_samples, freq='D').strftime('%Y-%m-%d'),
})
# Scatter the points on a carto-positron basemap, sized by count, colored by type.
fig = px.scatter_mapbox(
    df, lat="Latitude", lon="Longitude", color="Crime Type",
    size="Crime Count", hover_name="Crime Type",
    hover_data=["Date", "Crime Count"],
    color_continuous_scale=px.colors.cyclical.IceFire, size_max=15, zoom=10,
    mapbox_style="carto-positron",
)
# Center the layout on downtown Chicago.
fig.update_layout(title='Enhanced Crime Activities Map in Chicago', mapbox=dict(center=dict(lat=41.8781, lon=-87.6298), zoom=9))
fig.show()
# ---- Data Preparation ----
# Features and label for the decision-tree classifier.
feature_cols = ['Beat', 'District', 'Ward', 'Community Area', 'Hour_AMPM']  # Add more features as needed
label_col = 'Primary Type'
# Index each categorical feature. Iterate feature_cols directly rather than
# set(feature_cols): a set has no stable order, so the assembled feature
# vector's layout would differ between runs. The loop variable is `c` instead
# of `col`, which would shadow pyspark.sql.functions.col imported above.
indexers = [StringIndexer(inputCol=c, outputCol=c + "_index").fit(df_clean) for c in feature_cols]
label_indexer = StringIndexer(inputCol=label_col, outputCol=label_col + "_index").fit(df_clean)
# Assemble the indexed feature columns into a single vector column.
assembler = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers], outputCol="features")
# Define and apply the pipeline
pipeline = Pipeline(stages=indexers + [label_indexer, assembler])
prepared_df = pipeline.fit(df_clean).transform(df_clean)
# ---- Model Training and Evaluation ----
# 80/20 train/test split with a fixed seed for reproducibility.
train_data, test_data = prepared_df.randomSplit([0.8, 0.2], seed=1234)
# maxBins must be at least the largest categorical cardinality among the features.
dtc = DecisionTreeClassifier(labelCol=label_col + "_index", featuresCol="features", maxBins=300)
dtc_model = dtc.fit(train_data)
# Make predictions
predictions = dtc_model.transform(test_data)
# Evaluate the model. NOTE: precisionByLabel/recallByLabel report the metric for
# label index 0.0 (the most frequent class) by default, not a per-label average.
evaluator = MulticlassClassificationEvaluator(labelCol=label_col + "_index", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
# ---- Print Relevant Information ----
# Model Summary
print(f"Number of nodes in the decision tree: {dtc_model.numNodes}")
print(f"Depth of the decision tree: {dtc_model.depth}")
# Decision Tree Structure
print("\nDecision Tree Model:")
print(dtc_model.toDebugString)
# Evaluation Metrics
print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
# Output:
# Number of nodes in the decision tree: 23
# Depth of the decision tree: 5
# Decision Tree Model:
# DecisionTreeClassificationModel: uid=DecisionTreeClassifier_7f31b6119432, depth=5, numNodes=23, numClasses=32, numFeatures=5
# If (feature 4 in {0.0,4.0,6.0,10.0,22.0,25.0,26.0,38.0,41.0,50.0,51.0,54.0,61.0,67.0,69.0,71.0,89.0,90.0,97.0,101.0,108.0,113.0,120.0,123.0,126.0,133.0,144.0,159.0,160.0,161.0,162.0,171.0,172.0,173.0,177.0,179.0,188.0,199.0,200.0,211.0,218.0,219.0,223.0,224.0,226.0,229.0,233.0,243.0,247.0,249.0,250.0,254.0,258.0,259.0,262.0,263.0,266.0,268.0,271.0,272.0,273.0,274.0})
#  Predict: 0.0
# Else (feature 4 not in {0.0,4.0,6.0,10.0,22.0,25.0,26.0,38.0,41.0,50.0,51.0,54.0,61.0,67.0,69.0,71.0,89.0,90.0,97.0,101.0,108.0,113.0,120.0,123.0,126.0,133.0,144.0,159.0,160.0,161.0,162.0,171.0,172.0,173.0,177.0,179.0,188.0,199.0,200.0,211.0,218.0,219.0,223.0,224.0,226.0,229.0,233.0,243.0,247.0,249.0,250.0,254.0,258.0,259.0,262.0,263.0,266.0,268.0,271.0,272.0,273.0,274.0})
#  If (feature 0 in {10.0,15.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0})
#   If (feature 0 in {15.0,17.0,18.0,20.0,21.0,23.0})
#    Predict: 1.0
#   Else (feature 0 not in {15.0,17.0,18.0,20.0,21.0,23.0})
#    If (feature 0 in {19.0,22.0})
#     If (feature 2 in {1.0,2.0,8.0,17.0,25.0,26.0,30.0,33.0,35.0,36.0,37.0,38.0,43.0,44.0,45.0,50.0,52.0,58.0,59.0,60.0,61.0,62.0,66.0,67.0,68.0,71.0,72.0,73.0,75.0})
#      Predict: 0.0
#     Else (feature 2 not in {1.0,2.0,8.0,17.0,25.0,26.0,30.0,33.0,35.0,36.0,37.0,38.0,43.0,44.0,45.0,50.0,52.0,58.0,59.0,60.0,61.0,62.0,66.0,67.0,68.0,71.0,72.0,73.0,75.0})
#      Predict: 1.0
#    Else (feature 0 not in {19.0,22.0})
#     Predict: 1.0
#  Else (feature 0 not in {10.0,15.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0})
#   If (feature 2 in {1.0,2.0,4.0,8.0,17.0,23.0,25.0,26.0,30.0,32.0,33.0,34.0,35.0,37.0,38.0,42.0,43.0,44.0,45.0,48.0,49.0,50.0,51.0,52.0,55.0,58.0,60.0,62.0,64.0,67.0,68.0,71.0,72.0,73.0,74.0,76.0})
#    If (feature 0 in {1.0,14.0})
#     If (feature 4 in {18.0,85.0,99.0,107.0,111.0,115.0,136.0,143.0,148.0,149.0,180.0,190.0,214.0,220.0,222.0,235.0,240.0,255.0,265.0,269.0,270.0})
#      Predict: 6.0
#     Else (feature 4 not in {18.0,85.0,99.0,107.0,111.0,115.0,136.0,143.0,148.0,149.0,180.0,190.0,214.0,220.0,222.0,235.0,240.0,255.0,265.0,269.0,270.0})
#      Predict: 0.0
#    Else (feature 0 not in {1.0,14.0})
#     Predict: 0.0
#   Else (feature 2 not in {1.0,2.0,4.0,8.0,17.0,23.0,25.0,26.0,30.0,32.0,33.0,34.0,35.0,37.0,38.0,42.0,43.0,44.0,45.0,48.0,49.0,50.0,51.0,52.0,55.0,58.0,60.0,62.0,64.0,67.0,68.0,71.0,72.0,73.0,74.0,76.0})
#    If (feature 3 in {0.0,2.0,3.0,5.0,6.0,9.0,10.0,13.0,15.0,17.0,22.0})
#     If (feature 4 in {8.0,29.0,45.0,70.0,109.0,117.0,124.0,180.0,185.0,248.0})
#      Predict: 0.0
#     Else (feature 4 not in {8.0,29.0,45.0,70.0,109.0,117.0,124.0,180.0,185.0,248.0})
#      Predict: 1.0
#    Else (feature 3 not in {0.0,2.0,3.0,5.0,6.0,9.0,10.0,13.0,15.0,17.0,22.0})
#     If (feature 4 in {8.0,19.0,23.0,27.0,109.0,134.0})
#      Predict: 10.0
#     Else (feature 4 not in {8.0,19.0,23.0,27.0,109.0,134.0})
#      Predict: 1.0
# Evaluation Metrics:
# Accuracy: 0.2590360307593478
# Precision: 0.32553083675237426
# Recall: 0.5811374711838688
# F1 Score: 0.1603358241467655
# ---- K-Means clustering on the crime coordinates ----
from pyspark.ml.evaluation import ClusteringEvaluator

# Assemble (Latitude, Longitude) into a single feature vector per record.
coordinate_cols = ['Latitude', 'Longitude']
assembler = VectorAssembler(inputCols=coordinate_cols, outputCol="features")
vectorized_data = assembler.transform(df_clean).select('features')
# Number of clusters
k = 5  # Adjust based on your data
# Fit K-Means with a fixed seed, then assign each point to a cluster.
kmeans = KMeans().setK(k).setSeed(1)
model = kmeans.fit(vectorized_data)
predictions = model.transform(vectorized_data)
# Silhouette score (squared Euclidean) as a clustering-quality measure.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
# Output: Silhouette with squared euclidean distance = 0.5478207645024277 Cluster Centers: [ 41.77798889 -87.68658361] [ 41.9682436 -87.67862659] [ 41.90306264 -87.74420991] [ 41.87448533 -87.64517502] [ 41.7412362 -87.60521981]
# ---- DBSCAN clustering plotted on a Chicago basemap ----
# `df` here is the synthetic sample frame built for the mapbox demo above.
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude))
# Tag the points as WGS84 lat/lon, then project to Web Mercator so contextily
# can draw the basemap underneath.
gdf = gdf.set_crs("EPSG:4326").to_crs(epsg=3857)
# DBSCAN runs on the original degree coordinates still held in the
# Longitude/Latitude columns (the projection only changed the geometry).
dbscan = DBSCAN(eps=0.001, min_samples=10)  # Tune these parameters
gdf['cluster'] = dbscan.fit_predict(gdf[['Longitude', 'Latitude']])
# Plot clusters over a CartoDB Positron basemap.
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(column='cluster', categorical=True, legend=True, markersize=45, cmap="tab20", ax=ax)
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)
# Customize the appearance.
ax.set_title('DBSCAN Clustering of Crime Data in Chicago', fontdict={'fontsize': '15', 'fontweight': '3'})
ax.set_axis_off()
plt.show()
# Shut down the Spark session now that all analysis is done.
crime.stop()